WRANGLING

knitr::opts_chunk$set(error = FALSE,
                      tidy = TRUE,
                      # tidy.opts = list(width.cutoff = 60),
                      message = FALSE,
                      warning = FALSE)
library(tidyverse)
library(readxl)
library(readr)
library(haven)
library(knitr)
library(kableExtra)
library(lubridate)
library(labelled)
library(gtsummary)
library(data.table)
# Working directory saved as `wd` and data path as `data_path`
wd <- getwd()
data_path <- paste(wd, "Data", sep = "/")

Identification numbers

id_sid_matchup

# Import & clean id_sid_matchup2.sas7bdat
id_sid_matchup2 <- read_sas(paste(data_path, "ids", "id_sid_matchup2.sas7bdat", sep = "/"))
id_sid_clean <- id_sid_matchup2 %>%
    transmute(sid, newsid = as.character(newsid), shift_num = str_extract(shift_num_ampm,
        "[:digit:]+"), date = d8, am_pm = str_extract(shift_num_ampm, "am|pm"), shift_start_dttm = startd8time,
        shift_start_tm = ShiftStart, shift_end_dttm = ymd_hms(paste(date, ShiftEnd)),
        shift_end_tm = ShiftEnd, staff = as_factor(staff))

# preview id_sid_match_cleaned
rmarkdown::paged_table(id_sid_clean)
# Save id_sid_clean to files in csv and Rdata format write_csv(id_sid_clean,
# path = paste(data_path, 'Rdata', 'id_sid_clean.Rdata', sep = '/'))
# write_csv(id_sid_clean, path = paste(data_path, 'csv_files',
# 'id_sid_clean.csv', sep = '/'))

id_final

ids_final <- read_sas(paste(data_path, "ids", "finalsids6944.sas7bdat", sep = "/"))
# preview ids_final (no cleaning needed)
rmarkdown::paged_table(ids_final)
# Check if sids repeate within shifts:
ids_final %>%
    group_by(shift_num) %>%
    count(sid) %>%
    filter(n > 1) %>%
    rmarkdown::paged_table()

pt_sids

pt_sids <- read_sas(paste(data_path, "ids", "patientSIDs_in_completepat.sas7bdat",
    sep = "/"))
# preview unclean pt_sids glimpse(pt_sids)
pt_sid_clean <- pt_sids %>%
    transmute(sid, shift_num = str_extract(shift_num_ampm, "[:digit:]+"), shift_ampm = as_factor(str_extract(shift_num_ampm,
        "am|pm")))
# preview pt_sid_clean
rmarkdown::paged_table(pt_sid_clean)
pt_sid_clean %>%
    group_by(shift_num) %>%
    count(shift_ampm) %>%
    arrange(shift_num) %>%
    rmarkdown::paged_table()
pt_sid_clean %>%
    count(shift_ampm) %>%
    rmarkdown::paged_table()
# Save to file write_rds(pt_sid_clean, path = paste(data_path, 'Rdata',
# 'pt_sid_clean.Rdata', sep = '/')) write_csv(pt_sid_clean, path =
# paste(data_path, 'csv_files', 'pt_sid_clean.csv', sep = '/'))
# generate_dictionary(pt_sid_clean)

##staff_sids

staff_sids <- read_sas(paste(data_path, "ids", "staffsids_in_eventfile.sas7bdat",
    sep = "/"))
str(staff_sids)
## tibble [95 × 5] (S3: tbl_df/tbl/data.frame)
##  $ sid            : chr [1:95] "0002f35c" "0002f43c" "0002f445" "0002f447" ...
##  $ job            : chr [1:95] "" "" "" "" ...
##  $ job_SAH        : chr [1:95] "STAFF" "RN" "STAFF" "RN" ...
##  $ ParticipantCat4: chr [1:95] "STAFF" "RN" "STAFF" "RN" ...
##   ..- attr(*, "label")= chr "MD/RN/Staff/Patient"
##  $ JOBTITLE       : chr [1:95] "STAFF" "RN" "STAFF" "RN" ...
staff_sids_clean <- staff_sids %>%
    mutate(across(.cols = !sid, .fns = as.factor)) %>%
    select(-job)
rmarkdown::paged_table(staff_sids_clean)
staff_sids_clean %>%
    count(ParticipantCat4) %>%
    rmarkdown::paged_table()
# Save to file write_csv(staff_sids_clean, file = paste(data_path, 'Rdata',
# 'staff_sids_clean.Rdata', sep = '/')) write_csv(staff_sids_clean, file =
# paste(data_path, 'csv_files', 'staff_sids_clean.csv', sep = '/'))
# generate_dictionary(staff_sids_clean)

allstaff

allstaff <- read_sas(paste(data_path, "staff", "allstaff_numbyshift.sas7bdat", sep = "/"))
# str(allstaff)
allstaff_clean <- allstaff %>%
    mutate(job_title = as_factor(JOBTITLE), .keep = "unused")
# write_rds(allstaff_clean, paste(data_path, 'Rdata', 'allstaff_clean.Rdata',
# sep = '/')) write_csv(allstaff_clean, paste(data_path, 'csv_files',
# 'allstaff_clean.csv', sep = '/'))
rmarkdown::paged_table(allstaff_clean)
# generate_dictionary(allstaff_clean)

ntab2

ntab2 <- read_csv(file = paste(data_path, "ids", "nTab2.csv", sep = "/"))
# glimpse(ntab2)
ntab2_clean <- ntab2 %>%
    transmute(observation = obs, sid = sids, id = as.character(ID), date = dmy(D8),
        shift_num = shiftnum, shift_person = person, staff = as_factor(staff), degree,
        wdegree, WtAlpha)

rmarkdown::paged_table(ntab2_clean)
# save to file write_rds(ntab2_clean, file = paste(data_path, 'Rdata',
# 'ntab2_clean.Rdata', sep = '/')) write_csv(ntab2_clean, file =
# paste(data_path, 'csv_files', 'ntab2_clean.csv', sep = '/'))
# generate_dictionary(ntab2_clean)

ntab2 lists 6944 observations inclusive of all shifts. These observations list up to - individuals per shift, all 81 shifts.

Shift Info

shift_times

shift_times <- read_xls(paste(data_path, "shifts", "Shift Start End times v3.xls",
    sep = "/"), col_names = c("date", "shift_start_tm", "shift_end_tm", "shift_ampm",
    "comments"), col_types = c("date", "guess", "guess", "text", "text", "skip",
    "skip", "skip", "skip", "skip", "skip"), skip = 1)
# shift_times %>% str()
rmarkdown::paged_table(shift_times)
# generate_dictionary(shift_times) write_csv(shift_times, file =
# paste(data_path, 'csv_files', 'shift_times.csv', sep = '/'))
# write_rds(shift_times, file = paste(data_path, 'Rdata', 'shift_times.Rdata',
# sep = '/'))

counts_per_shift

counts_per_shift <- read_sas(paste(data_path, "shifts", "countspershift.sas7bdat",
    sep = "/"))
# str(counts_per_shift)
countspershift_clean <- counts_per_shift %>%
    transmute(shift_num, date = d8, am_pm = str_extract(shift_num_ampm, "am|pm"),
        staffcount = as.integer(staffcount), patcount = as.integer(patcount), total = as.integer(total))
rmarkdown::paged_table(countspershift_clean)
# write_rds(countspershift_clean, file = paste(data_path, 'Rdata',
# 'countspershift_clean.Rdata', sep = '/')) write_csv(countspershift_clean,
# file = paste(data_path, 'csv_files', 'countspershift_clean.csv', sep = '/'))

person_dataset

person_dataset <- read_sas(paste(data_path, "patients", "person_dataset.sas7bdat",
    sep = "/"))

person_dataset <- person_dataset %>%
    transmute(shift_num = as.integer(shiftnum), person = as.character(person), staff = as_factor(staff),
        degree, minutes)
rmarkdown::paged_table(person_dataset)
# generate_dictionary(person_dataset) write_rds(person_dataset,
# paste(data_path, 'Rdata', 'person_dataset.Rdata', sep = '/'))
# write_csv(person_dataset, paste(data_path, 'csv_files', 'person_dataset.csv',
# sep = '/'))

Room Info

location_categories

location_cats <- read_sas(paste(data_path, "rooms", "location_categories_list.sas7bdat",
    sep = "/"))
rmarkdown::paged_table(location_cats)

location_defs

location_defs <- read_sas(paste(data_path, "rooms", "location_definitions.sas7bdat",
    sep = "/"))
rmarkdown::paged_table(location_defs)
# generate_dictionary(location_defs)

room_locs

room_locs <- read_xlsx(paste(data_path, "rooms", "Room Locations and Square Footage - with Correction.xlsx",
    sep = "/"), col_names = c("id", "name", "type", "sqft_area"), col_types = c("text",
    "text", "text", "numeric", "skip"), skip = 1)
rmarkdown::paged_table(room_locs)

locations_list

locations_list <- read_sas(paste(data_path, "rooms", "locationslist.sas7bdat", sep = "/"))
rmarkdown::paged_table(locations_list)

Patient Data

Patient Demographics

Demographic information with med_rec numbers and encounter numbers. Note that med_rec numbers may repeat!

demographics_df

demographic_df <- read_csv(paste(data_path, "patients", "Population_Patient_Demographics.csv", sep = "/"))
# glimpse(demographic_df)
demographics_clean <- demographic_df %>%
  transmute(med_rec_no = as.character(`Medical Record Number`),
         encounter_no = as.character(`Enc .`),
         arrival_date = mdy(`ED Arrival Timestamp`), 
         departure_date = mdy(`ED Departure Timestamp`),
         # across(!where(is.Date)
         #        & !where(is.double)
         #        & !where(is.integer)
         #        & !med_rec_no
         #        & !encounter_no
         #        & !`ED Arrival Timestamp`
         #        & !`ED Departure Timestamp`
         #        & !`Chief Complaint`
         #        & !`% ED Patients Admitted as Inpt or Obs`, 
         #        .fns = as_factor),
         sex = as_factor(Sex, ),
         age = `Patient Age at Visit`,
         race = as_factor(Race),
         chief_complaint = `Chief Complaint`,
         acuity = as_factor(Acuity),
         arrival_mode = as_factor(`Arr Mode`),
         disposition = as_factor(`ED Disposition`)) %>% 
  # str()
# demographics_clean_test %>% 
  mutate(across(where(is.factor), ~recode(.x, `Not Recorded` = NA_character_, `Unknown` = NA_character_))) %>% 
  mutate(acuity = fct_relevel(acuity,
                              "1 Immediate",
                              "2 Emergent",
                              "3 Urgent",
                              "4 Stable",
                              "5 Non Urgent"),
         arrival_cat = fct_recode(arrival_mode,
                                  "EMS" = "EMS Ground",
                                  "EMS" = "EMS Air",
                                  "Custody" = "Police/Jail",
                                  "Ambulatory" = "Private Vehicle",
                                  "Ambulatory" = "Public Trans", 
                                  "Ambulatory" = "Walk"),
         race = fct_relevel(race,
                            "Black",
                            "White",
                            "Hispanic",
                            "Asian",
                            "Other",
                            "1",
                            "2",
                            "3"), .after = 10)
# generate_dictionary(demographics_clean_test)

# generate_dictionary(demographics_clean)

rmarkdown::paged_table(demographics_clean)
demographics_clean %>% 
  count(race) %>% 
  rmarkdown::paged_table()
demographics_clean %>% 
  group_by(acuity) %>% 
  summarise(`Total (n)` = n(), 
            `Black (n)` = sum(race == "Black", na.rm = TRUE),
            `Black (%)` = round((`Black (n)`/`Total (n)`)*100, 2),
            `White (n)` = sum(race == "White", na.rm = TRUE),
            `White (%)` = round((`White (n)`/`Total (n)`)*100, 2),
            `Hispanic (n)` = sum(race == "Hispanic", na.rm = TRUE),
            `Hispanic (%)` = round((`Hispanic (n)`/`Total (n)`) * 100, 2),
            `Other (n)` = sum(race == "Other", na.rm = TRUE),
            `Other (%)` = round((`Other (n)`/`Total (n)`) * 100, 2),
            `Missing (n)` = sum(is.na(race)),
            `Missing (%)` = round((`Missing (n)`/`Total (n)`) * 100, 2)) %>% 
  rename(Acuity = acuity) %>% 
  rmarkdown::paged_table()
# write_rds(demographics_clean, paste(data_path, "Rdata", "demographics_clean.Rdata", sep = "/"))
# write_csv(demographics_clean, paste(data_path, "csv_files", "demographics_clean.csv", sep = "/"))

pop_pt_demographics

pop_pt_demographics <- read_csv(paste(data_path, "patients", "Population_Patient_Demographics.csv",
    sep = "/"))
nms <- c("arrival_date", "med_rec_no", "encounter_no", "sex", "age", "race", "chief_complaint",
    "acuity", "arrival_mode", "departure_date", "disposition", "prop_admitted")
names(pop_pt_demographics) <- nms
# str(pop_pt_demographics) generate_dictionary(pop_pt_demographics)
pop_pt_demo_clean <- pop_pt_demographics %>%
    mutate(med_rec_no = as.character(med_rec_no), encounter_no = as.character(encounter_no),
        arrival_date = mdy(arrival_date), .after = 3, departure_date = mdy(departure_date),
        sex = as_factor(sex), race = as_factor(race), acuity = as_factor(acuity),
        arrival_mode = as_factor(arrival_mode)) %>%
    mutate(across(where(is.factor), ~recode(.x, `Not Recorded` = NA_character_, Unknown = NA_character_))) %>%
    mutate(acuity = fct_relevel(acuity, "1 Immediate", "2 Emergent", "3 Urgent",
        "4 Stable", "5 Non Urgent"), arrival_cat = fct_recode(arrival_mode, EMS = "EMS Ground",
        EMS = "EMS Air", Custody = "Police/Jail", Ambulatory = "Private Vehicle",
        Ambulatory = "Public Trans", Ambulatory = "Walk"), .after = 9, race = fct_relevel(race,
        "Black", "White", "Hispanic", "Asian", "Other", "1", "2", "3"))

rmarkdown::paged_table(pop_pt_demo_clean)
# generate_dictionary(pop_pt_demo_clean) write_csv(pop_pt_demo_clean, file =
# paste(data_path, 'csv_files', 'pop_pt_demo_clean.csv', sep = '/'))
# write_rds(pop_pt_demo_clean, file = paste(data_path, 'Rdata',
# 'pop_pt_demo_clean.Rdata', sep = '/'))

pts_population

pts_population <- read_sas(paste(data_path, "patients", "Patients_population.sas7bdat",
    sep = "/"))
rmarkdown::paged_table(pts_population)
# generate_dictionary(pts_population) write_csv(pts_population, file =
# paste(data_path, 'csv_files', 'pts_population.csv', sep = '/'))
# write_rds(pts_population, file = paste(data_path, 'Rdata',
# 'pts_population.Rdata', sep = '/'))

pts_pop2

pts_pop2 <- read_sas(paste(data_path, "patients", "patients_population2.sas7bdat",
    sep = "/"))
rmarkdown::paged_table(pts_pop2)
# generate_dictionary(pts_pop2)

# write_csv(pts_pop2, file = paste(data_path, 'csv_files', 'pts_pop2.csv', sep
# = '/')) write_rds(pts_pop2, file = paste(data_path, 'Rdata',
# 'pts_pop2.Rdata', sep = '/'))

“completepat.sas7bdat” contains RFID badge locations (by room number) for each sampled patient for every second they were monitored in the ED.

The file is large, so subset a random sample of observations with slice_sample() to test code for wrangling.

Use pivot_longer() to reshape from wide format to long format by collapsing all location-by-second columns (variables flocX$, where X is the shift time in seconds that the respective SID was identified in room number flocX) into two columns, names set to seconds and values to location.

Note that pivot_longer() will create many duplicate SIDs and many NAs which will require filtering.

Patient RFID Data

pt_complete

pt_complete <- read_sas(paste(data_path, "patients", "completepat.sas7bdat", sep = "/"))
# glimpse(pt_complete)
# 1. subset a random sample of 30 observations for data transformation code preparation
# set.seed(711)
# pt_complete_sample <- pt_complete %>% slice_sample(n = 30)
# pt_complete_sample %>%
pt_complete_long <- pt_complete %>%

# 2. Pivot the data.frame from wide to long by placing all column names that start with "floc" into a new column, "seconds," and placing respective observations for each "floc" variable into a "location_num" column
  pivot_longer(cols = starts_with("floc"), 
               names_to = "shift_second", # moves column names to new column "seconds"
               values_to = "location_num", # moves location numbers to new column "location_num"
               values_drop_na = TRUE) %>% # removes all rows (observations) without location data
# 3. Delete the prefix "floc" from `seconds` and change class to numeric
  mutate(shift_second = as.double(str_replace(shift_second, "floc", "")),
         location_num = as_factor(location_num),
         shift_num_ampm = str_trim(shift_num_ampm),
         am_pm = as_factor(str_extract(shift_num_ampm, "am|pm")),
         shift_num = as.integer(str_extract(shift_num_ampm, "[:digit:]+")),
         date = make_date(year = year, month = mon, day = day)) %>% 
  select(sid, date, shift_num, am_pm, location_num, shift_second) %>% 
  # count(am_pm)
  mutate(day_night = as_factor(if_else(am_pm == "am", "day", "night")),
         .after = 4) # %>% 
  # glimpse()
# Save as .Rdata file
rmarkdown::paged_table(pt_complete_long)
# write_rds(pt_complete_long, file = paste(data_path, "Rdata", "pt_complete_long.Rdata", sep = "/"))
# write_csv(pt_complete_long, file = paste(data_path, "csv_files", "pt_complete_long.csv", sep = "/"))
# generate_dictionary(pt_complete_long)

After transforming & cleaning “completepat.sas7bdat”, pt_complete_long is 7 variables wide and 61342625 rows long.

Staff Data

Staff RFID data

The SAS data file, “completestaff.sas7bdat,” contains RFID badge location (by room number) for all staff each second of every shift.

staff_complete

# 1. pull 'completestaff.sas7bdat' from `sas_data_list` as `tibble()`
staff_complete <- read_sas(paste(data_path, "staff", "completestaff.sas7bdat", sep = "/"))
# Subset a random sample of 30 observations set.seed(321) staff_complete_sample
# <- staff_complete %>% slice_sample(n = 30) 2. Use a random sample of
# staff_complete for data cleaning & transformation
staff_complete_long <- staff_complete %>%
    # 2a. Pivot the data.frame from wide to long by placing all column names
    # that start with 'floc' into a new column, 'seconds,' and placing
    # respective observations for each 'floc' variable into a 'location_num'
    # column
pivot_longer(cols = starts_with("floc"), names_to = "seconds", values_to = "location_num",
    values_drop_na = TRUE) %>%
    # 2b. Remove the prefix 'floc' from `time_seconds` and keep the digits as
    # `seconds`
mutate(seconds = as.double(str_replace(seconds, "floc", "")), location_num = as_factor(location_num),
    shift_num_ampm = str_trim(shift_num_ampm), shift_num = as.integer(str_extract(shift_num_ampm,
        "[:digit:]+")), am_pm = as_factor(str_extract(shift_num_ampm, "am|pm")),
    date = make_date(year = year, month = mon, day = day)) %>%
    select(-d8:-mon)

# staff_complete_sample2 write_rds(staff_complete_long, file = paste(data_path,
# 'Rdata', 'staff_complete_long.Rdata', sep = '/'))
# write_csv(staff_complete_long, file = paste(data_path, 'csv_files',
# 'staff_complete_long.csv', sep = '/')) 3. View data frame structure
# generate_dictionary(staff_complete_long)
rmarkdown::paged_table(staff_complete_long)

Network Data

Whole Networks

net_all_xl

net_all_xl <- read_xlsx(paste(data_path, "networks", "network_allshifts.xlsx", sep = "/"))
str(net_all_xl)
## tibble [46,062 × 17] (S3: tbl_df/tbl/data.frame)
##  $ i         : num [1:46062] 34 34 34 34 34 34 34 34 35 35 ...
##  $ any       : num [1:46062] 1 1 1 1 1 1 1 1 1 1 ...
##  $ staffi    : num [1:46062] 0 0 0 0 0 0 0 0 0 0 ...
##  $ idi       : num [1:46062] 79200934 79200934 79200934 79200934 79200934 ...
##  $ d8        : POSIXct[1:46062], format: "2009-07-09" "2009-07-09" ...
##  $ H1N1      : num [1:46062] 0 0 0 0 0 0 0 0 0 0 ...
##  $ quarter   : num [1:46062] 1 1 1 1 1 1 1 1 1 1 ...
##  $ shiftampm : num [1:46062] 2 2 2 2 2 2 2 2 2 2 ...
##  $ d9        : num [1:46062] 5 5 5 5 5 5 5 5 5 5 ...
##  $ edgeweight: num [1:46062] 0.0639 0.0806 0.2206 0.0706 0.0119 ...
##  $ j         : num [1:46062] 37 66 80 84 85 86 93 98 38 40 ...
##  $ staffj    : num [1:46062] 0 0 0 0 0 0 0 0 0 0 ...
##  $ combo     : num [1:46062] 0 0 0 0 0 0 0 0 0 0 ...
##  $ idj       : num [1:46062] 79200937 79200966 79200980 79200984 79200985 ...
##  $ comboc    : chr [1:46062] "0 patient-patient" "0 patient-patient" "0 patient-patient" "0 patient-patient" ...
##  $ minutes   : num [1:46062] 3.833 4.833 13.233 4.233 0.717 ...
##  $ shiftnum  : num [1:46062] 1 1 1 1 1 1 1 1 1 1 ...
net_all_xl %>%
    rmarkdown::paged_table()

net_all_sas

net_all_sas <- read_sas(paste(data_path, "networks", "network_allshifts.sas7bdat",
    sep = "/"))
# str(net_all_sas)
net_all_sas %>%
    rmarkdown::paged_table()
generate_dictionary(net_all_sas)

Edges

edges_all

edges_all <- read_sas(paste(data_path, "networks", "allshifts_edges.sas7bdat", sep = "/"))
# str(edges_all)
edges_all %>%
    rmarkdown::paged_table()
edges2 <- read_sas(paste(data_path, "networks", "edges2.sas7bdat", sep = "/"))
rmarkdown::paged_table(edges2)

Events

events_long

events_long <- read_sas(paste(data_path, "events", "eventfile_long2.sas7bdat", sep = "/"))
rmarkdown::paged_table(events_long)
# write_csv(events_long, file = paste(data_path, 'csv_files',
# 'events_long.csv', sep = '/'))

events_long has duplicates, use eventfile_nodup if needed.

events_unique

events_unique <- read_sas(paste(data_path, "events", "events_uniquepersid2.sas7bdat",
    sep = "/"))
rmarkdown::paged_table(events_unique)

eventfile_nodup

eventfile_nodup <- read_sas(paste(data_path, "events", "eventfile_long6944_noduploc.sas7bdat",
    sep = "/"))
eventfile_nodup %>%
    rmarkdown::paged_table()
# write_csv(eventfile_nodup, file = paste(data_path, 'csv_files',
# 'eventfile_nodup.csv', sep = '/')) write_rds(eventfile_nodup, file =
# paste(data_path, 'Rdata', 'eventfile_nodup.Rdata', sep = '/'))

Fully Joined Datasets

rdata_dir <- paste0(getwd(), "/Data/Rdata/")

# population.Rdata
population <- read_rds(paste0(rdata_dir, "population.Rdata"))
glimpse(population)
## Rows: 57,514
## Columns: 12
## $ med_rec_no      <chr> "1173785", "870282", "1061988", "856823", "856823", "8…
## $ encounter_no    <chr> "37829225", "38040069", "38229285", "38360060", "38369…
## $ arrival_date    <date> 2009-08-13, 2010-03-10, 2009-10-12, 2010-03-01, 2009-…
## $ departure_date  <date> 2009-08-13, 2010-03-11, 2009-10-12, 2010-03-01, 2009-…
## $ sex             <fct> Female, Female, Male, Female, Female, Female, Male, Fe…
## $ age             <dbl> 47, 49, 57, 44, 43, 44, 50, 50, 42, 42, 70, 70, 54, 90…
## $ race            <fct> Black, Black, Black, Black, Black, Black, Black, Black…
## $ chief_complaint <chr> "Earache", "Assault", "Difficulty Breathing", "Seizure…
## $ acuity          <fct> 4 Stable, 4 Stable, 2 Emergent, 3 Urgent, 2 Emergent, …
## $ arrival_mode    <fct> EMS Ground, Private Vehicle, EMS Ground, Private Vehic…
## $ arrival_cat     <fct> EMS, Ambulatory, EMS, Ambulatory, EMS, Ambulatory, Amb…
## $ disposition     <fct> Discharge, Discharge, Admit, NA, Admit, Discharge, Dis…
# names(population) demographics.Rdata demographics <-
# read_rds(paste0(rdata_dir, 'demographics_clean.Rdata')) glimpse(demographics)

# pts_all.Rdata
pts_all <- read_rds(paste0(rdata_dir, "pts_all.Rdata"))
# glimpse(pts_all) names(pts_all)
test <- population %>%
    full_join(pts_all, by = c("encounter_no", "sex", "age", "race", "chief_complaint",
        "acuity", "arrival_mode", "arrival_cat", "disposition"))
glimpse(test)
## Rows: 58,215
## Columns: 35
## $ med_rec_no        <chr> "1173785", "870282", "1061988", "856823", "856823", …
## $ encounter_no      <chr> "37829225", "38040069", "38229285", "38360060", "383…
## $ arrival_date      <date> 2009-08-13, 2010-03-10, 2009-10-12, 2010-03-01, 200…
## $ departure_date    <date> 2009-08-13, 2010-03-11, 2009-10-12, 2010-03-01, 200…
## $ sex               <fct> Female, Female, Male, Female, Female, Female, Male, …
## $ age               <dbl> 47, 49, 57, 44, 43, 44, 50, 50, 42, 42, 70, 70, 54, …
## $ race              <fct> Black, Black, Black, Black, Black, Black, Black, Bla…
## $ chief_complaint   <chr> "Earache", "Assault", "Difficulty Breathing", "Seizu…
## $ acuity            <fct> 4 Stable, 4 Stable, 2 Emergent, 3 Urgent, 2 Emergent…
## $ arrival_mode      <fct> EMS Ground, Private Vehicle, EMS Ground, Private Veh…
## $ arrival_cat       <fct> EMS, Ambulatory, EMS, Ambulatory, EMS, Ambulatory, A…
## $ disposition       <chr> "Discharge", "Discharge", "Admit", NA, "Admit", "Dis…
## $ sid               <chr> NA, NA, "002167be", NA, NA, NA, NA, NA, NA, NA, NA, …
## $ date              <date> NA, NA, 2009-10-12, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ shift_num         <chr> NA, NA, "96", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ shift_ampm        <fct> NA, NA, am, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ encounter_no2     <chr> NA, NA, "38229285", NA, NA, NA, NA, NA, NA, NA, NA, …
## $ pt_id             <chr> NA, NA, "3", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ age2              <dbl> NA, NA, 57, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ arrival_dttm      <dttm> NA, NA, 2009-10-12 03:49:59, NA, NA, NA, NA, NA, NA…
## $ arrival_tm        <time>              NA,              NA, 03:50:00.000000, …
## $ departure_dttm    <dttm> NA, NA, 2009-10-12 09:07:00, NA, NA, NA, NA, NA, NA…
## $ departure_tm      <time>              NA,              NA, 09:07:00.000000, …
## $ time2tag          <dbl> NA, NA, 197, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ start_date        <dttm> NA, NA, 2009-10-12 07:15:00, NA, NA, NA, NA, NA, NA…
## $ end_date          <dttm> NA, NA, 2009-10-12 19:00:00, NA, NA, NA, NA, NA, NA…
## $ los               <dbl> NA, NA, 317.7333, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ los_minutes       <dbl> NA, NA, 317, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ los_hours         <dbl> NA, NA, 5.283333, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ job               <chr> NA, NA, "PATIENT", NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ approached        <fct> NA, NA, Yes, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ participant       <fct> NA, NA, Yes, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ pt_with_data      <fct> NA, NA, Yes, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ participant_final <fct> NA, NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ nested_loc        <list> <NULL>, <NULL>, [<tbl_df[6765 x 2]>], <NULL>, <NULL…
all_pts <- test %>%
    unnest(cols = "nested_loc", keep_empty = TRUE)  # %>%
# glimpse() pts_all == pts_nested
glimpse(all_pts)
## Rows: 62,044,002
## Columns: 36
## $ med_rec_no        <chr> "1173785", "870282", "1061988", "1061988", "1061988"…
## $ encounter_no      <chr> "37829225", "38040069", "38229285", "38229285", "382…
## $ arrival_date      <date> 2009-08-13, 2010-03-10, 2009-10-12, 2009-10-12, 200…
## $ departure_date    <date> 2009-08-13, 2010-03-11, 2009-10-12, 2009-10-12, 200…
## $ sex               <fct> Female, Female, Male, Male, Male, Male, Male, Male, …
## $ age               <dbl> 47, 49, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, …
## $ race              <fct> Black, Black, Black, Black, Black, Black, Black, Bla…
## $ chief_complaint   <chr> "Earache", "Assault", "Difficulty Breathing", "Diffi…
## $ acuity            <fct> 4 Stable, 4 Stable, 2 Emergent, 2 Emergent, 2 Emerge…
## $ arrival_mode      <fct> EMS Ground, Private Vehicle, EMS Ground, EMS Ground,…
## $ arrival_cat       <fct> EMS, Ambulatory, EMS, EMS, EMS, EMS, EMS, EMS, EMS, …
## $ disposition       <chr> "Discharge", "Discharge", "Admit", "Admit", "Admit",…
## $ sid               <chr> NA, NA, "002167be", "002167be", "002167be", "002167b…
## $ date              <date> NA, NA, 2009-10-12, 2009-10-12, 2009-10-12, 2009-10…
## $ shift_num         <chr> NA, NA, "96", "96", "96", "96", "96", "96", "96", "9…
## $ shift_ampm        <fct> NA, NA, am, am, am, am, am, am, am, am, am, am, am, …
## $ encounter_no2     <chr> NA, NA, "38229285", "38229285", "38229285", "3822928…
## $ pt_id             <chr> NA, NA, "3", "3", "3", "3", "3", "3", "3", "3", "3",…
## $ age2              <dbl> NA, NA, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, 57, …
## $ arrival_dttm      <dttm> NA, NA, 2009-10-12 03:49:59, 2009-10-12 03:49:59, 2…
## $ arrival_tm        <time>              NA,              NA, 03:50:00.000000, …
## $ departure_dttm    <dttm> NA, NA, 2009-10-12 09:07:00, 2009-10-12 09:07:00, 2…
## $ departure_tm      <time>       NA,       NA, 09:07:00, 09:07:00, 09:07:00, 0…
## $ time2tag          <dbl> NA, NA, 197, 197, 197, 197, 197, 197, 197, 197, 197,…
## $ start_date        <dttm> NA, NA, 2009-10-12 07:15:00, 2009-10-12 07:15:00, 2…
## $ end_date          <dttm> NA, NA, 2009-10-12 19:00:00, 2009-10-12 19:00:00, 2…
## $ los               <dbl> NA, NA, 317.7333, 317.7333, 317.7333, 317.7333, 317.…
## $ los_minutes       <dbl> NA, NA, 317, 317, 317, 317, 317, 317, 317, 317, 317,…
## $ los_hours         <dbl> NA, NA, 5.283333, 5.283333, 5.283333, 5.283333, 5.28…
## $ job               <chr> NA, NA, "PATIENT", "PATIENT", "PATIENT", "PATIENT", …
## $ approached        <fct> NA, NA, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ participant       <fct> NA, NA, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ pt_with_data      <fct> NA, NA, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes, Yes,…
## $ participant_final <fct> NA, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ location_num      <fct> NA, NA, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, …
## $ shift_second      <dbl> NA, NA, 901, 902, 903, 904, 905, 906, 907, 908, 909,…
# staff_all == staff_locs_nested
staff_all <- read_rds(paste0(rdata_dir, "staff_all.Rdata"))
# glimpse(staff_all)

all_staff <- staff_all %>%
    unnest(cols = nested_loc, keep_empty = TRUE)
glimpse(all_staff)
## Rows: 58,373,401
## Columns: 9
## Groups: sid, shift_num [2,218]
## $ sid             <chr> "0002f4e2", "0002f4e2", "0002f4e2", "0002f4e2", "0002f…
## $ date            <date> 2009-07-09, 2009-07-09, 2009-07-09, 2009-07-09, 2009-…
## $ shift_num       <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1",…
## $ shift_ampm      <fct> pm, pm, pm, pm, pm, pm, pm, pm, pm, pm, pm, pm, pm, pm…
## $ location_num    <fct> 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91, 91…
## $ shift_second    <dbl> 3622, 3623, 3624, 3625, 3626, 3627, 3628, 3629, 3630, …
## $ job_sah         <fct> STAFF, STAFF, STAFF, STAFF, STAFF, STAFF, STAFF, STAFF…
## $ participant_cat <fct> STAFF, STAFF, STAFF, STAFF, STAFF, STAFF, STAFF, STAFF…
## $ jobtitle        <fct> STAFF, STAFF, STAFF, STAFF, STAFF, STAFF, STAFF, STAFF…
# what file format do data need to for uploading to google cloud???
# write_rds(all_pts, file = 'all_pts.Rdata') write_rds(all_staff, file =
# 'all_staff.Rdata')
# directory path
rdata_dir <- "/Users/tommy-two/Documents/1_Research/2_Data_Science/0_Projects/1_NACI/Data/Rdata"
rdata_archive <- paste0(rdata_dir, "/rds_archive")
# rdata_dir
# WRANGLING----

## X--ntab2_final----
ntab2_final <- read_rds(paste0(rdata_dir, "/ntab2_final.Rdata"))
# view loaded data
ntab2_final %>%
  glimpse()
# ntab2_clean <- ntab2_clean %>%
#   mutate(shift_num = as.character(shift_num),
#          shift_person = as.character(shift_person),
#          sid = format(sid, scientific = FALSE))
# ntab2_clean <- ntab2_clean %>%
#   relocate(shift_person, .after = id) %>%
#   mutate(sid = str_trim(sid))
# ntab2_nms <- names(ntab2_clean)

# Duplicate sid for two patients in the same shift (shift 91; 10/07/2009):
# ntab2_final %>%
#   mutate(sid = (sid)) %>%
#   group_by(shift_num) %>%
#   count(sid) %>%
#   filter(n > 1)
# one duplicate sid from shift 91:
# ntab2_final %>%
#   filter(shift_num == "91" & sid == "2.17E+04")
# id_sid_final %>%
#   glimpse()
# ntab2_final %>%
#   # filter(str_detect(sid, "^2")) %>%
#   filter(shift_num == "8") %>%
#   count(sid) %>%
#   arrange(desc(n)) #%>%
#   filter(str_detect(sid, "2\\.17"))
# id_sid_final %>%
#   # glimpse()
#   group_by(shift_num) %>%
#   count(sid) %>%
#   arrange(desc(n)) %>%
#   filter(str_detect(sid, "217"))
#   filter(str_detect(sid, "217"))
#   mutate(staff = if_else(staff == 1, "Staff", "Patient"))
# write_rds(ntab2_clean, file = paste0(rdata_dir, "/ntab2_final.Rdata"))
# rm(ntab2_clean)

## X--id_sid_clean----
id_sid_clean <- read_rds(paste0(rdata_dir, "/id_sid_clean.Rdata"))
# view loaded data
# id_sid_clean %>%
#   glimpse()
# compare to original file: "id_sid_matchup2.sas7bdat"
# idsid_matchup <- read_sas("Data/ids/id_sid_matchup2.sas7bdat")
id_sid_clean %>%
  glimpse()
# idsid_matchup %>%
  # glimpse()
  # group_by(numshift) %>%
  # count(newsid) %>%
  # filter(n > 1)
  # summarise(max(newsid))

id_sid_clean <- id_sid_clean %>%
  rename(shift_ampm = am_pm,
         shift_person = newsid) # %>%
# glimpse()
# write_rds(id_sid_clean, paste0(rdata_dir, "/id_sid_final.Rdata"))
# rm(id_sid_clean)

id_sid_clean %>%
  group_by(shift_num) %>%
  count(sid) %>%
  arrange(desc(n))
# **ids_final ------
id_sid_final <- read_rds(paste0(rdata_dir, "/id_sid_final.Rdata"))
id_sid_final %>%
  group_by(staff) %>%
  count(shift_num) %>%
  ungroup() %>%
  pivot_wider(names_from = staff, values_from = n) %>%
  transmute(`Shift Number` = as.numeric(shift_num),
            `Patients (n)` = `0`,
            `Staff (n)` = `1`) %>%
  arrange(`Shift Number`) %>%
  rownames_to_column() %>%
  mutate(`Shift Number` = rowname,
         .keep = "unused") %>%
  relocate(`Shift Number`, `Patients (n)`, `Staff (n)`) %>%
  mutate(`Total (n)` = `Patients (n)` + `Staff (n)`)


id_sid_final %>%
  glimpse()
## **staff_sids----
# staff_sids <- read_csv(file = "Data/csv_files/staff_sids_clean.csv",
#                        col_names = c("sid", "job_sah", "participant_cat", "jobtitle"),
#                        col_types = "cfff",
#                        skip = 1)
# Read saved R object from file = "staff_sids_clean.Rdata"
staff_sids <- read_rds(paste0(rdata_dir, "/staff_sids-5_1_22.Rdata"))
staff_sids %>%
  # glimpse()
  count(sid) %>%
  arrange(desc(n))
# write_rds(staff_sids, paste0(rdata_dir, "/staff_sids_final.Rdata"))
# rm(staff_sids)

# no duplicates in staff sid. 88 total staff members.
staff_sids_final <- read_rds(paste0(rdata_dir, "/staff_sids_final.Rdata"))
staff_sids_final %>%
  glimpse()

## **pt_sids----
pt_sid_clean <- read_rds(paste0(rdata_dir, "/pt_sid_clean.Rdata"))
# view loaded data
shifts_vec <- pt_sid_clean %>%
  mutate(shift_num = as.integer(shift_num)) %>%
  arrange(shift_num) %>%
  pull(shift_num) %>%
  unique() %>%
  as.character()
# set seed for reproducibility:
set.seed(711)
# select a random sample of
random_shifts <- sample(shifts_vec, 20)

# many duplicate sids:
pt_sid_clean %>%
  count(sid) %>%
  filter(n > 1) %>%
  arrange(desc(n))
# no duplicate sids when grouped by shift:
pt_sid_clean %>%
  group_by(shift_num) %>%
  count(sid) %>%
  arrange(desc(n))
# write_rds(pt_sid_clean, paste0(rdata_dir, "/pt_sid_final.Rdata"))
# rm(pt_sid_clean)


## **population (from demographics_clean)----
demographics_clean <- read_rds(paste0(rdata_dir, "/demographics_clean.Rdata"))
demographics_clean %>%
  glimpse()
# saved as population.Rdata
# write_rds(demographics_clean, paste0(rdata_dir, "/population.Rdata"))

## **pts_pop_clean----
pts_population <- read_rds(paste0(rdata_archive, "/pts_population.Rdata"))
pts_pop_clean <- pts_population %>%
  mutate(sid,
         encounter_no = str_trim(EncounterNum),
         encounter_no2 = as.character(encnum),
         pt_id = as.character(ID),
         date = d8,
         shift_num = as.character(numshift),
         shift_ampm = factor(shift_ampm),
         sex = factor(Sex),
         age = as.numeric(Patient_Age_at_Visit),
         age2 = AGE,
         race = factor(Race,
                       levels = c("Black",
                                  "White",
                                  "Hispanic",
                                  "Other"),
                       exclude = c("Not Recorded", "1", "2", "3")),
         chief_complaint = Chief_Complaint,
         acuity = factor(Acuity,
                         levels = c("1 Immediate",
                                    "2 Emergent",
                                    "3 Urgent",
                                    "4 Stable",
                                    "5 Non Urgent"),
                         exclude = "Not Recorded"),
         arrival_dttm = ED_Arrival_Timestamp,
         arrival_tm = timeARR,
         arrival_mode = factor(Arr_Mode, exclude = "Not Recorded"),
         arrival_cat = fct_recode(arrival_mode,
                                  "EMS" = "EMS Ground",
                                  "EMS" = "EMS Air",
                                  "Custody" = "Police/Jail",
                                  "Ambulatory" = "Private Vehicle",
                                  "Ambulatory" = "Public Trans",
                                  "Ambulatory" = "Walk"),
         departure_dttm = ED_Departure_Timestamp,
         departure_tm = timeDEP,
         time2tag = duration_UntilTag,
         start_date = startd8time,
         end_date = endd8time,
         los = durationInED,
         los_minutes = MinutesInED,
         los_hours = hrsinED,
         disposition = ED_Disposition,
         job = JOB,
         approached = factor(approached),
         participant = factor(participantyn),
         pt_with_data = factor(Pt_withData),
         # levels = c("Yes" = 1,
         #            "No" = 0)),
         participant_final = factor(Participant_final),
         .keep = "none") %>%
  mutate(across(where(is.factor), ~na_if(., ""))) %>%
  mutate(across(where(is.character), ~na_if(., ""))) %>%
  mutate(across(where(is.factor), ~na_if(., "Not Recorded"))) %>%
  mutate(across(where(is.character), ~na_if(., "Not Recorded"))) %>%
  relocate(shift_ampm, .after = date) %>%
  relocate(arrival_cat, .after = arrival_mode) %>%
  relocate(approached, .before = participant)

# check results
pts_pop_clean %>%
  glimpse()
  # filter(encounter_no %in% demographics_clean$encounter_no)
  count(shift_num)

  # filter(encounter_no %in% pts_pop_clean$encounter_no)
# write pts_pop to rds file:
# write_rds(pts_pop, paste0(rdata_dir, "/pts_pop_clean.Rdata"))
# rm(pts_pop)

## X--allstaff_clean----
allstaff_clean <- read_rds(paste0(rdata_dir, "/allstaff_clean.Rdata"))
allstaff_clean %>%
  glimpse()

## **pt_complete----
# pt_complete <- read_rds(paste0(rdata_dir, "/pt_complete_long.Rdata"))
# view loaded data
# pt_comp <- pt_complete %>%
#   mutate(shift_num = as.character(shift_num),
#          day_night = NULL) %>%
#   rename(shift_ampm = am_pm)
pt_comp_nested <- read_rds(paste0(rdata_dir, "/pt_comp_nested-5_2_22.Rdata"))
# create unnested version
pt_comp_unnested <- pt_comp_nested %>%
  unnest(cols = c(data))
# view unnested version
glimpse(pt_comp_unnested)
# ungroup unnested df
pt_comp_unnested %>%
  ungroup() %>%
  count(date) %>%
  arrange(date) %>%
  plot(date, n)
# renest df by sid & shift_num only
pt_comp_nested <- pt_comp_unnested %>%
  group_by(sid, shift_num) %>%
  nest(nested_loc = c(location_num, shift_second)) # %>%
# check for duplicates:
# pt_comp has many duplicate sids:
pt_comp %>%
  count(sid) %>%
  filter(n > 1)

# pt_comp_nested has no duplicate sids
  pt_comp_nested %>%
    # glimpse() %>%
    count(sid) %>%
    filter(n > 1)
# no duplicates found
# glimpse(pt_comp_unnested)
glimpse(pt_comp_nested)
# write_rds(pt_comp_nested, file = paste0(rdata_dir, "/pt_locs_nested.Rdata"))
# rm(pt_comp_nested)

## **staff_complete----
staff_complete <- read_rds(paste0(rdata_dir, "/staff_complete_long.Rdata"))

staff_comp <- staff_complete %>%
  mutate(shift_num = as.character(shift_num),
         firstday = NULL) %>%
  rename(shift_ampm = am_pm,
         shift_second = seconds) %>%
  relocate(sid, date, shift_num, shift_ampm, location_num, shift_second)
# exclude from staff_sids all rows with sid not in staff_complete
stf_comp_sids <- staff_complete %>%
  distinct(sid)
stf_sids_sids <- staff_sids %>%
  select(sid)
excluded_staff_sids <- staff_sids %>%
  filter(!(stf_sids_sids$sid %in% stf_comp_sids$sid)) %>%
  pull(sid)
staff_sids_88 <- staff_sids %>%
  filter(!(sid %in% excluded_staff_sids))

# write_rds(staff_sids_88, file = paste0(rdata_dir, "/staff_sids-5_1_22.Rdata"))
# write_csv(staff_sids_88, file = paste0("Data/csv_files/staff_sids-5_1_22"))
staff_comp %>%
  glimpse()

# staff_comp has many duplicated sid numbers, but not within shifts:

staff_comp %>%
  count(sid) %>%
  filter(n > 1)



staff_comp_nested <- staff_comp %>%
  group_by(sid, shift_num) %>%
  nest(nested_loc = c(location_num, shift_second)) # %>%
# staff_comp_nested has no duplicated sids
# the following table shows number of observed staff (n) per shift, totals of staff working per shift is not available
staff_comp_nested %>%
  ungroup() %>%
  count(shift_num) %>%
  arrange(desc(n))
  # filter(n > 1)
# to unnest staff_comp_nested, use:
staff_comp_nested %>%
  unnest(cols = c(nested_loc))
# view glimpse() of staff_comp_nested:
staff_comp_nested %>%
  glimpse()
write_rds(staff_comp_nested, paste0(rdata_dir, "/staff_locs_nested.Rdata"))
rm(staff_comp, staff_comp_nested)
rm(staff_complete)
## **events_clean----
# load eventfile_nodup.Rdata
events <- read_rds(paste0(rdata_dir, "/eventfile_nodup.Rdata"))

# number of staff: 88
# events %>%
#   filter(ParticipantCat2 == "STAFF") %>%
#   group_by(sid) %>%
#   count() %>%
#   ungroup() %>%
#   summarise(n_staff = length(sid))



# clean events:
events_clean <- events %>%
  mutate(sid = sid,
         pt_id = patient,
         encounter_no = str_trim(EncounterNum),
         participant_cat = factor(ParticipantCat4),
         job_cat = factor(job),
         job_name = if_else(jobfullname == "", "Patient", jobfullname),
         date = ymd(d8),
         shift_num = shift_num,
         shift_start_dttm = startd8time,
         shift_end_dttm = endd8time,
         tag_on_dttm = tag_starttime,
         tag_off_dttm = tagofftime,
         event_id = factor(event),
         event_dttm = ymd_hms(et),
         event_tm = ti,
         room_index = locindex,
         room_name = room,
         .keep = "none",
         .after = "sid") %>%
  relocate(pt_id, date, shift_num, participant_cat, .after = encounter_no)

# write_rds(events_clean, paste0(rdata_dir, "/events_final.Rdata"))
# rm(events_clean, events)
pt_id_vec <- events_clean %>%
  filter(pt_id %in% pts_pop_clean$id_no) %>%
  pull(pt_id)
# c(pt_id_vec)
pt_id_vec <- pt_id_vec %>%
  # group_by(pt_id) %>%
  distinct(pt_id) %>%
  pull(pt_id)

pts_pop_clean %>%
  filter(id_no %in% pt_id_vec$pt_id)

events_clean %>%
  filter(pt_id %in% pt_id_vec$pt_id)

events_cl_nested <- events_clean %>%
  group_by(shift_num, sid, encounter_no, date) %>%
  nest()


# JOINING DFs:-----
# Read saved data objects:
id_sid <- read_rds(paste0(rdata_dir, "/id_sid_final.Rdata"))
ntab2 <- read_rds(paste0(rdata_dir, "/ntab2_final.Rdata"))
pt_info <- read_rds(paste0(rdata_dir, "/pt_info_final.Rdata"))
pt_sids <- read_rds(paste0(rdata_dir, "/pt_sid_final.Rdata"))
pts_nested <- read_rds(paste0(rdata_dir, "/pt_locs_nested.Rdata"))
staff_nested <- read_rds(paste0(rdata_dir, "/staff_locs_nested.Rdata"))
staff_sids <- read_rds(paste0(rdata_dir, "/staff_sids_final.Rdata"))
events <- read_rds(paste0(rdata_dir, "/events_final.Rdata"))

## staff_all <-  staff_nested + staff_sids =====
staff_all <- staff_nested %>%
  left_join(staff_sids, by = "sid")

staff_all %>%
  glimpse()



# write_rds(staff_all, paste0(rdata_dir, "/staff_all.Rdata"))

# pts_all <- pts_nested + pt_info----

pt_info %>%
  glimpse()
# names(pts_nested)
# names(pt_info)
# test <- pts_nested %>%
#   full_join(pt_info, by = c("sid",
#                             "date",
#                             "shift_num",
#                             "shift_ampm"))

# glimpse(test)
# pts_nested %>%
  # glimpse()
# move list column to last position
pts_all <- pts_all %>%
  ungroup() %>%
  relocate(nested_loc, .after = participant_final)
pts_all %>%
  glimpse()

# pts_all_unnested <- pts_all %>%
#   unnest(cols = nested_loc) #%>%

# save pts_all_unnested as .csv & .Rdata files:
# write_csv(pts_all_unnested, paste0("Data/csv_files/pts_all_unnested.csv"))
# write_rds(pts_all, paste0(rdata_dir, "/pts_all.Rdata"))






# SLICES:----

sliced_pts <- pts_all %>%
  unnest(cols = c(nested_loc)) %>%
  group_by(sid, shift_num) %>%
  slice_sample(prop = 0.005)

# sliced_pts %>%
#   ungroup() %>%
#   group_by(encounter_no)# %>%
# summarise(n_pts = n(),
#           n_participants = sum(as.numeric(participant_final), na.rm = TRUE))


# sliced_pts %>%
#   # glimpse()
#   ungroup() %>%
#   count(sid) %>%
#   summarise(sliced_pts_n = n(), sliced_obs_n = sum(n))
# summarise(sum(n))

EXPLORATION

# SETUP ---- create path to data archive
rdata_dir <- "/Users/tommy-two/Documents/1_Research/2_Data_Science/0_Projects/1_NACI/Data/Rdata"
rdata_archive <- paste0(rdata_dir, "/rds_archive")
## load packages library(naniar) # for missing data library(lubridate)
library(labelled)
library(vtable)  # good package for summary tables
library(tidyverse)
## import data sets
population_730 <- read_rds(paste0(rdata_dir, "/population.Rdata"))
n_patients_730 <- population_730 %>%
    nrow()
# n_patients_730
population_81 <- read_rds(paste0(rdata_dir, "/pt_info_final.Rdata"))
n_patients_81 <- population_81 %>%
    nrow()
# n_patients_81

# Make a table of patient & staff counts (n) per shift
# (shift_counts_table.csv)----
id_sid_final <- read_rds(paste0(rdata_dir, "/id_sid_final.Rdata"))
shift_counts_table <- id_sid_final %>%
    group_by(staff) %>%
    count(shift_num) %>%
    ungroup() %>%
    pivot_wider(names_from = staff, values_from = n) %>%
    transmute(`Shift Number` = as.numeric(shift_num), `Patients (n)` = `0`, `Staff (n)` = `1`) %>%
    arrange(`Shift Number`) %>%
    rownames_to_column() %>%
    mutate(`Shift Number` = rowname, .keep = "unused") %>%
    relocate(`Shift Number`, `Patients (n)`, `Staff (n)`) %>%
    mutate(`Total (n)` = `Patients (n)` + `Staff (n)`)
# write_csv(shift_counts_table, paste0(getwd(),
# '/Tables/shift_counts_table.csv'))

SessionInfo

sessioninfo::session_info() %>%
    details::details(summary = "Current session info", open = FALSE)
Current session info

─ Session info ───────────────────────────────────────────────────────────────
 setting  value
 version  R version 4.1.2 (2021-11-01)
 os       macOS Big Sur 10.16
 system   x86_64, darwin17.0
 ui       X11
 language (EN)
 collate  en_US.UTF-8
 ctype    en_US.UTF-8
 tz       America/New_York
 date     2022-05-31
 pandoc   2.17.1.1 @ /Applications/RStudio.app/Contents/MacOS/quarto/bin/ (via rmarkdown)

─ Packages ───────────────────────────────────────────────────────────────────
 package       * version date (UTC) lib source
 assertthat      0.2.1   2019-03-21 [1] CRAN (R 4.1.0)
 backports       1.4.1   2021-12-13 [1] CRAN (R 4.1.0)
 bit             4.0.4   2020-08-04 [1] CRAN (R 4.1.0)
 bit64           4.0.5   2020-08-30 [1] CRAN (R 4.1.0)
 broom           0.8.0   2022-04-13 [1] CRAN (R 4.1.2)
 broom.helpers   1.7.0   2022-04-22 [1] CRAN (R 4.1.2)
 bslib           0.3.1   2021-10-06 [1] CRAN (R 4.1.0)
 cellranger      1.1.0   2016-07-27 [1] CRAN (R 4.1.0)
 cli             3.3.0   2022-04-25 [1] CRAN (R 4.1.2)
 clipr           0.8.0   2022-02-22 [1] CRAN (R 4.1.2)
 colorspace      2.0-3   2022-02-21 [1] CRAN (R 4.1.2)
 crayon          1.5.1   2022-03-26 [1] CRAN (R 4.1.2)
 data.table    * 1.14.2  2021-09-27 [1] CRAN (R 4.1.0)
 DBI             1.1.2   2021-12-20 [1] CRAN (R 4.1.0)
 dbplyr          2.1.1   2021-04-06 [1] CRAN (R 4.1.0)
 desc            1.4.1   2022-03-06 [1] CRAN (R 4.1.2)
 details         0.3.0   2022-03-27 [1] CRAN (R 4.1.2)
 digest          0.6.29  2021-12-01 [1] CRAN (R 4.1.0)
 dplyr         * 1.0.9   2022-04-28 [1] CRAN (R 4.1.2)
 ellipsis        0.3.2   2021-04-29 [1] CRAN (R 4.1.0)
 evaluate        0.15    2022-02-18 [1] CRAN (R 4.1.2)
 fansi           1.0.3   2022-03-24 [1] CRAN (R 4.1.2)
 fastmap         1.1.0   2021-01-25 [1] CRAN (R 4.1.0)
 forcats       * 0.5.1   2021-01-27 [1] CRAN (R 4.1.0)
 formatR         1.12    2022-03-31 [1] CRAN (R 4.1.2)
 fs              1.5.2   2021-12-08 [1] CRAN (R 4.1.0)
 generics        0.1.2   2022-01-31 [1] CRAN (R 4.1.2)
 ggplot2       * 3.3.6   2022-05-03 [1] CRAN (R 4.1.2)
 glue            1.6.2   2022-02-24 [1] CRAN (R 4.1.2)
 gt              0.5.0   2022-04-21 [1] CRAN (R 4.1.2)
 gtable          0.3.0   2019-03-25 [1] CRAN (R 4.1.0)
 gtsummary     * 1.6.0   2022-04-25 [1] CRAN (R 4.1.2)
 haven         * 2.5.0   2022-04-15 [1] CRAN (R 4.1.2)
 hms             1.1.1   2021-09-26 [1] CRAN (R 4.1.0)
 htmltools       0.5.2   2021-08-25 [1] CRAN (R 4.1.0)
 httr            1.4.3   2022-05-04 [1] CRAN (R 4.1.2)
 jquerylib       0.1.4   2021-04-26 [1] CRAN (R 4.1.0)
 jsonlite        1.8.0   2022-02-22 [1] CRAN (R 4.1.2)
 kableExtra    * 1.3.4   2021-02-20 [1] CRAN (R 4.1.0)
 knitr         * 1.39    2022-04-26 [1] CRAN (R 4.1.2)
 labelled      * 2.9.1   2022-05-05 [1] CRAN (R 4.1.2)
 lifecycle       1.0.1   2021-09-24 [1] CRAN (R 4.1.0)
 lubridate     * 1.8.0   2021-10-07 [1] CRAN (R 4.1.0)
 magrittr        2.0.3   2022-03-30 [1] CRAN (R 4.1.2)
 modelr          0.1.8   2020-05-19 [1] CRAN (R 4.1.0)
 munsell         0.5.0   2018-06-12 [1] CRAN (R 4.1.0)
 pillar          1.7.0   2022-02-01 [1] CRAN (R 4.1.2)
 pkgconfig       2.0.3   2019-09-22 [1] CRAN (R 4.1.0)
 png             0.1-7   2013-12-03 [1] CRAN (R 4.1.0)
 purrr         * 0.3.4   2020-04-17 [1] CRAN (R 4.1.0)
 R6              2.5.1   2021-08-19 [1] CRAN (R 4.1.0)
 readr         * 2.1.2   2022-01-30 [1] CRAN (R 4.1.2)
 readxl        * 1.4.0   2022-03-28 [1] CRAN (R 4.1.2)
 reprex          2.0.1   2021-08-05 [1] CRAN (R 4.1.0)
 rlang           1.0.2   2022-03-04 [1] CRAN (R 4.1.2)
 rmarkdown       2.14    2022-04-25 [1] CRAN (R 4.1.2)
 rprojroot       2.0.3   2022-04-02 [1] CRAN (R 4.1.2)
 rstudioapi      0.13    2020-11-12 [1] CRAN (R 4.1.0)
 rvest           1.0.2   2021-10-16 [1] CRAN (R 4.1.0)
 sass            0.4.1   2022-03-23 [1] CRAN (R 4.1.2)
 scales          1.2.0   2022-04-13 [1] CRAN (R 4.1.2)
 sessioninfo     1.2.2   2021-12-06 [1] CRAN (R 4.1.0)
 stringi         1.7.6   2021-11-29 [1] CRAN (R 4.1.0)
 stringr       * 1.4.0   2019-02-10 [1] CRAN (R 4.1.0)
 svglite         2.1.0   2022-02-03 [1] CRAN (R 4.1.2)
 systemfonts     1.0.4   2022-02-11 [1] CRAN (R 4.1.2)
 tibble        * 3.1.7   2022-05-03 [1] CRAN (R 4.1.2)
 tidyr         * 1.2.0   2022-02-01 [1] CRAN (R 4.1.2)
 tidyselect      1.1.2   2022-02-21 [1] CRAN (R 4.1.2)
 tidyverse     * 1.3.1   2021-04-15 [1] CRAN (R 4.1.0)
 tzdb            0.3.0   2022-03-28 [1] CRAN (R 4.1.2)
 utf8            1.2.2   2021-07-24 [1] CRAN (R 4.1.0)
 vctrs           0.4.1   2022-04-13 [1] CRAN (R 4.1.2)
 viridisLite     0.4.0   2021-04-13 [1] CRAN (R 4.1.0)
 vroom           1.5.7   2021-11-30 [1] CRAN (R 4.1.0)
 webshot         0.5.3   2022-04-14 [1] CRAN (R 4.1.2)
 withr           2.5.0   2022-03-03 [1] CRAN (R 4.1.2)
 xfun            0.30    2022-03-02 [1] CRAN (R 4.1.2)
 xml2            1.3.3   2021-11-30 [1] CRAN (R 4.1.0)
 yaml            2.3.5   2022-02-21 [1] CRAN (R 4.1.2)

 [1] /Library/Frameworks/R.framework/Versions/4.1/Resources/library

──────────────────────────────────────────────────────────────────────────────